Author: Linwood Creekmore
Email: valinvescap@gmail.com
In [ ]:
import re
import pytz
import gdelt
import datetime
import numpy as np
import pandas as pd
import seaborn as sns
import geoplot as gplt
from tzwhere import tzwhere
from bs4 import BeautifulSoup
import matplotlib.pyplot as plt
# initialize the timezone lookup; forceTZ returns the nearest zone when a point falls outside timezone polygons
tz1 = tzwhere.tzwhere(forceTZ=True)
gdeltPyR
It's easy to set up gdeltPyR. This single line gets us ready to query. See the github project page for details on accessing other tables and setting other parameters. Then we just pass in a date to pull the data. It's really that simple. The only concern is memory: pulling multiple days of GDELT can consume lots of memory. If you run into issues, make a workflow that pulls one day at a time and writes each result to disk (a sketch of such a workflow follows the query below).
In [ ]:
# instantiate the gdeltPyR object
gd = gdelt.gdelt()
# pull events for 1-2 October 2017; coverage=True pulls every 15-minute update for the dates
%time vegas = gd.Search(['Oct 1 2017','Oct 2 2017'],normcols=True,coverage=True)
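If memory is tight, here is a minimal sketch of that pull-and-write workflow; the date range and csv file names are illustrative, not part of the original analysis.
In [ ]:
# a sketch: pull one day at a time and write each result to disk
# the date range and file names here are hypothetical
for day in pd.date_range('2017-10-01', '2017-10-02'):
    result = gd.Search(day.strftime('%Y %b %d'), normcols=True, coverage=True)
    result.to_csv('gdelt_{}.csv'.format(day.strftime('%Y%m%d')), index=False)
    del result  # free memory before pulling the next day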
In [ ]:
def striptimen(x):
    """Strip time from numpy array or list of dates that are integers"""
    date = str(int(x))
    n = np.datetime64("{}-{}-{}T{}:{}:{}".format(date[:4], date[4:6], date[6:8],
                                                 date[8:10], date[10:12], date[12:]))
    return n

def timeget(x):
    '''Convert to a datetime object with a UTC time tag'''
    try:
        # localize the naive timestamp to UTC
        now_aware = pytz.utc.localize(x[2].to_pydatetime())
    except:
        return None
    try:
        # get the timezone string representation using the lat/lon pair
        timezone_str = tz1.tzNameAt(x[0], x[1], forceTZ=True)
        # get the time offset
        timezone = pytz.timezone(timezone_str)
        # convert UTC to the calculated local time
        aware = now_aware.astimezone(timezone)
        return aware
    except Exception as e:
        return None

# vectorize our two functions
vect = np.vectorize(striptimen)
vect2 = np.vectorize(timeget)
Now we apply the functions to create a datetime object column (dates) and a timezone-aware column (zone).
In [ ]:
# use the custom functions to build time-enabled columns of dates and zones
vegastimed = (vegas
              .assign(dates=vect(vegas.dateadded.values))
              .assign(zone=[timeget(k) for k in vegas
                            .assign(dates=vect(vegas.dateadded.values))
                            [['actiongeolat', 'actiongeolong', 'dates']].values]))
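As a quick sanity check, striptimen converts a raw dateadded integer (GDELT's YYYYMMDDHHMMSS format) into a numpy datetime64; the value below is illustrative, not drawn from the query results.
In [ ]:
# an illustrative dateadded value in YYYYMMDDHHMMSS format
print(striptimen(20171001224500))  # numpy.datetime64('2017-10-01T22:45:00')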
I return data in pandas dataframes to leverage the power of pandas data manipulation. Now we filter our data on the two target fields: actiongeofeatureid and eventrootcode. To learn more about the columns, see this page with descriptions for each header.
In [ ]:
# filter to data in Las Vegas and about violence/fighting/mass murder only
vegastimedfil = (vegastimed[
    ((vegas.eventrootcode == '19') |
     (vegas.eventrootcode == '20') |
     (vegas.eventrootcode == '18')) &
    (vegas.actiongeofeatureid == '847388')]
    .drop_duplicates('sourceurl'))
print(vegastimedfil.shape)
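For readability, the CAMEO event root codes used in the filter can be kept in a small lookup (labels per the CAMEO codebook; 847388 is the feature id the filter uses for Las Vegas).
In [ ]:
# CAMEO event root codes used in the filter above
cameo_roots = {
    '18': 'ASSAULT',
    '19': 'FIGHT',
    '20': 'USE UNCONVENTIONAL MASS VIOLENCE',
}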
In [ ]:
# lazy meta-character regex to pull the protocol and domain from a url
s = re.compile(r'(http://|https://)([A-Za-z0-9_\.-]+)')
In [ ]:
# build the chronological news stories and show the first few rows
print(vegastimedfil.set_index('zone')[['dates','sourceurl']].head())
Time-enabling the entire dataset is a fairly simple task.
In [ ]:
# example of converting to Los Angeles time
vegastimed.set_index(
    vegastimed.dates.astype('datetime64[ns]')
).tz_localize(
    'UTC'
).tz_convert(
    'America/Los_Angeles'
)
In [ ]:
# regex to strip a url from a string; should work on any url (let me know if it doesn't)
s = re.compile(r'(http://|https://)([A-Za-z0-9_\.-]+)')
# apply the regex to each url, strip out the provider, and assign it as a new column
print(vegastimedfil
      .assign(provider=vegastimedfil.sourceurl
              .apply(lambda x: s.search(x).group() if s.search(x) else np.nan))
      .groupby(['provider']).size().sort_values(ascending=False)
      .reset_index().rename(columns={0: "count"}).head())
In [ ]:
# chained operation to return the number of unique providers
vegastimedfil.assign(provider=vegastimedfil.sourceurl
                     .apply(lambda x: s.search(x).group() if s.search(x) else np.nan))['provider'] \
    .value_counts().shape
Knowing how many providers are producing content, it would be a good idea to understand the distribution of production; that is, how many articles each provider published. We use a kernel density plot and a cumulative distribution plot.
In [ ]:
# make the plot canvas
f, ax = plt.subplots(figsize=(15, 5))
# set the title
plt.title('Distributions of Las Vegas Active Shooter News Production')
# kernel density plot
sns.kdeplot(vegastimedfil.assign(provider=vegastimedfil.sourceurl
            .apply(lambda x: s.search(x).group() if s.search(x) else np.nan))['provider']
            .value_counts(), bw=0.4, shade=True, label='No. of articles written', ax=ax)
# cumulative distribution plot
sns.kdeplot(vegastimedfil.assign(provider=vegastimedfil.sourceurl
            .apply(lambda x: s.search(x).group() if s.search(x) else np.nan))['provider']
            .value_counts(), bw=0.4, shade=True, label='Cumulative', cumulative=True, ax=ax)
# show it
plt.show()
In [ ]:
# count events per 15-minute window, for all events and Las Vegas events only
timeseries = pd.concat(
    [vegastimed.set_index(vegastimed.dates.astype('datetime64[ns]'))
     .tz_localize('UTC').tz_convert('America/Los_Angeles')
     .resample('15T')['sourceurl'].count(),
     vegastimedfil.set_index('zone').resample('15T')['sourceurl'].count()],
    axis=1)
# fill empty event counts with zero
timeseries.fillna(0, inplace=True)
# rename the columns
timeseries.columns = ['Total Events', 'Las Vegas Events Only']
# add a column with Las Vegas events as a percentage of all events
timeseries = timeseries.assign(
    Normalized=(timeseries['Las Vegas Events Only'] / timeseries['Total Events']) * 100)
# make the plot
f, ax = plt.subplots(figsize=(13, 7))
ax = timeseries.Normalized.ewm(
    adjust=True, ignore_na=True, min_periods=10, span=20).mean().plot(
    color="#C10534", label='Exponentially Weighted Count')
ax.set_title('Reports of Violent Events Per 15 Minutes in Vegas', fontsize=28)
for label in ax.get_xticklabels():
    label.set_fontsize(16)
ax.set_xlabel('Hour of the Day', fontsize=20)
ax.set_ylabel('Percentage of Hourly Total', fontsize=15)
ax.legend()
plt.tight_layout()
plt.show()
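For reference, pandas relates the span argument of ewm to the smoothing factor by alpha = 2 / (span + 1), so span=20 weights each new 15-minute interval with alpha of roughly 0.095.
In [ ]:
# pandas' ewm span-to-alpha relationship
alpha = 2 / (20 + 1)
print(round(alpha, 3))  # 0.095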
In [ ]:
# complex, chained operations to compute each provider's median publish time
print((((vegastimedfil.reset_index()
         .assign(provider=vegastimedfil.reset_index().sourceurl
                 .apply(lambda x: s.search(x).group() if s.search(x) else np.nan),
                 epochzone=vegastimedfil.set_index('dates')
                 .reset_index()['dates']
                 .apply(lambda x: (x.to_pydatetime().timestamp())))
         .groupby('provider')
         .filter(lambda x: len(x) >= 10)
         .groupby('provider')
         .agg([np.mean, np.max, np.min, np.median])
         .sort_index(level='median', ascending=False)['epochzone']['median'])
        .apply(lambda x: datetime.datetime.fromtimestamp(int(x)))
        .sort_values(ascending=True)).reset_index()
       .set_index('median', drop=False)).tz_localize('UTC')
      .tz_convert('America/Los_Angeles'))
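The same computation can be broken into readable steps. This sketch is equivalent in intent to the chain above; the intermediate variable names are mine, not from the original.
In [ ]:
# a step-by-step version of the chained operation above
df = vegastimedfil.reset_index()
df = df.assign(
    provider=df.sourceurl.apply(lambda x: s.search(x).group() if s.search(x) else np.nan),
    epochzone=df.dates.apply(lambda x: x.to_pydatetime().timestamp()))
# keep providers with at least 10 stories, then take each provider's median publish time
medians = (df.groupby('provider')
             .filter(lambda x: len(x) >= 10)
             .groupby('provider')['epochzone']
             .median()
             .rename('median')
             .apply(lambda x: datetime.datetime.fromtimestamp(int(x)))
             .sort_values())
# index by the median time and convert from UTC to Las Vegas local time
print(medians.reset_index()
             .set_index('median', drop=False)
             .tz_localize('UTC')
             .tz_convert('America/Los_Angeles'))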
In [ ]:
# Author: Linwood Creekmore
# Email: valinvescap@gmail.com
# Description: Python script to pull content from a website (works on news stories).
# Notes
"""
23 Oct 2017: updated to include readability based on PyCon talk: https://github.com/DistrictDataLabs/PyCon2016/blob/master/notebooks/tutorial/Working%20with%20Text%20Corpora.ipynb
"""
###################################
# Standard Library imports
###################################
import re
from io import BytesIO
###################################
# Third party imports
###################################
import requests
import numpy as np
from bs4 import BeautifulSoup
from readability.readability import Document as Paper
# placeholder dictionary to keep track of what's been completed
done ={}
def textgetter(url):
"""Scrapes web news and returns the content
Parameters
----------
url : str
Address to news report
newstext: str
Returns all text in the "p" tag. This usually is the content of the news story.
"""
global done
TAGS = [
'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'h7', 'p', 'li'
]
# regex for url check
s = re.compile('(http://|https://)([A-Za-z0-9_\.-]+)')
answer = {}
# check that its an url
if s.search(url):
if url in done.keys():
return done[url]
pass
else:
r = requests.get(url)
if r.status_code != 200:
done[url]="Unable to reach website."
answer['base']=s.search(url).group()
answer['url']=url
answer['text']="Unable to reach website."
answer['title']=''
yield answer
doc = Paper(r.content)
data = doc.summary()
title = doc.title()
soup = BeautifulSoup(data,'lxml')
newstext = " ".join([l.text for l in soup.find_all(TAGS)])
del r,data
if len(newstext)>200:
answer['base']=s.search(url).group()
answer['text']=newstext
answer['url']=url
answer['title']=title
yield answer
else:
newstext = " ".join([l.text for l in soup.find_all('div',class_='field-item even')])
done[url]=newstext
if len(newstext)>200:
answer['url']=url
answer['base']=s.search(url).group()
answer['text']=newstext
answer['title']=""
yield answer
else:
answer['url']=url
answer['base']=s.search(url).group()
answer['text']='No text returned'
answer['title']=""
yield answer
else:
answer['text']='This is not a proper url'
answer['url']=url
answer['base']=''
answer['title']=""
yield answer
In [ ]:
# create a vectorized function
vect = np.vectorize(textgetter)
# vectorize the operation over a slice of urls; each call returns a generator
cc = vect(vegastimedfil['sourceurl'].values[10:25])
# advance each generator once to get the scraped content
dd = list(next(l) for l in cc)
# look at the output
pd.DataFrame(dd).head(5)
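Scraping is I/O bound, so a thread pool is one way to scale this beyond a small slice of urls. This is a sketch, not part of the original analysis; the fetch helper, worker count, and slice size are all arbitrary choices of mine.
In [ ]:
from concurrent.futures import ThreadPoolExecutor

# a hypothetical helper: advance the generator once to get the scraped record
def fetch(url):
    return next(textgetter(url))

# scrape the first 50 urls concurrently; max_workers is arbitrary
with ThreadPoolExecutor(max_workers=8) as pool:
    records = list(pool.map(fetch, vegastimedfil['sourceurl'].values[:50]))

pd.DataFrame(records).head()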